Keplan Meier

Author

Lianghui Yi

Code
# read in the data
data <- read.csv("../data/derived-data/PFS_Merged_Cleaned_factored.csv")
head(data)
A data.frame: 6 x 28
X STUDYID USUBJID CNSR TRT01P PPROTFL STARTDT ADT AVAL EVNTDESC ... RACE ETHNIC LDH ALB HGB EXTRT EXDOSE BMI ECOG CRP
<int> <chr> <chr> <int> <chr> <chr> <chr> <chr> <int> <chr> ... <chr> <chr> <dbl> <dbl> <dbl> <chr> <int> <dbl> <int> <dbl>
1 1 54767414MMY3008 SUBJ0001 0 TREAT A Y 2019-04-22 2021-01-12 631 DEATH ... White Not Hispanic or Latino 139.7219 155.2226 135.7905 Dexamethasone 110 26.83260 4 12.830053
2 2 54767414MMY3008 SUBJ0002 0 TREAT A Y 2021-10-13 2024-04-04 904 DEATH ... White Not Hispanic or Latino 115.6110 128.2108 131.7716 Daratumumab 30 25.42065 1 6.702462
3 3 54767414MMY3008 SUBJ0003 0 TREAT B Y 2020-10-31 2020-11-30 30 DEATH ... White Not Hispanic or Latino 172.8269 159.3242 142.5922 Dexamethasone 60 32.02936 2 11.959545
4 4 54767414MMY3008 SUBJ0004 1 TREAT A Y 2020-12-31 2021-09-25 268 ANALYSIS CUT OFF ... White Not Hispanic or Latino 144.8748 171.4453 129.9159 Lenalidomide 150 22.04537 1 2.101916
5 5 54767414MMY3008 SUBJ0005 1 TREAT A Y 2017-10-26 2018-07-13 260 LAST CONTACT ... White Not Hispanic or Latino 161.9881 131.3127 164.3331 Daratumumab 130 20.38940 1 7.230141
6 6 54767414MMY3008 SUBJ0006 0 TREAT B Y 2017-02-02 2017-12-13 314 DEATH ... White Not Hispanic or Latino 175.1388 141.2525 159.1366 Lenalidomide 20 21.33275 1 6.671318
Code
library(survival)
library(survminer)
library(ggplot2)
library(dplyr)          # For data manipulation

# Prepare the data
# CNSR: Event indicator (0 = event occurred, 1 = censored)
data$CNSR <- as.numeric(data$CNSR)
# AVAL: Time to event
data$AVAL <- as.numeric(data$AVAL)
# TRT01P: Treatment Method
data$TRT01P <- as.factor(data$TRT01P)
Code
# show all the feature columns names
names(data)

str(data)
  1. 'X'
  2. 'STUDYID'
  3. 'USUBJID'
  4. 'CNSR'
  5. 'TRT01P'
  6. 'PPROTFL'
  7. 'STARTDT'
  8. 'ADT'
  9. 'AVAL'
  10. 'EVNTDESC'
  11. 'RSSTRESC'
  12. 'TOTAE'
  13. 'SAECOUNT'
  14. 'AEDECOD_MS'
  15. 'AEBODSYS'
  16. 'AESER'
  17. 'AGE'
  18. 'SEX'
  19. 'RACE'
  20. 'ETHNIC'
  21. 'LDH'
  22. 'ALB'
  23. 'HGB'
  24. 'EXTRT'
  25. 'EXDOSE'
  26. 'BMI'
  27. 'ECOG'
  28. 'CRP'
'data.frame':   683 obs. of  28 variables:
 $ X         : int  1 2 3 4 5 6 7 8 9 10 ...
 $ STUDYID   : chr  "54767414MMY3008" "54767414MMY3008" "54767414MMY3008" "54767414MMY3008" ...
 $ USUBJID   : chr  "SUBJ0001" "SUBJ0002" "SUBJ0003" "SUBJ0004" ...
 $ CNSR      : num  0 0 0 1 1 0 0 0 1 0 ...
 $ TRT01P    : Factor w/ 2 levels "TREAT A","TREAT B": 1 1 2 1 1 2 1 2 1 1 ...
 $ PPROTFL   : chr  "Y" "Y" "Y" "Y" ...
 $ STARTDT   : chr  "2019-04-22" "2021-10-13" "2020-10-31" "2020-12-31" ...
 $ ADT       : chr  "2021-01-12" "2024-04-04" "2020-11-30" "2021-09-25" ...
 $ AVAL      : num  631 904 30 268 260 314 347 691 180 296 ...
 $ EVNTDESC  : chr  "DEATH" "DEATH" "DEATH" "ANALYSIS CUT OFF" ...
 $ RSSTRESC  : chr  "Stable Disease" "Progressive Disease" "Partial Response" "Progressive Disease" ...
 $ TOTAE     : int  1 2 NA 4 4 2 1 4 2 4 ...
 $ SAECOUNT  : int  0 0 NA 1 1 0 0 2 1 0 ...
 $ AEDECOD_MS: chr  "Nausea" "Fatigue" NA "Dizziness" ...
 $ AEBODSYS  : chr  "Gastrointestinal" "Nervous System" NA "Gastrointestinal" ...
 $ AESER     : chr  "N" "N" NA "N" ...
 $ AGE       : num  70.5 79.4 76.3 77.4 72.1 ...
 $ SEX       : chr  "Female" "Female" "Male" "Male" ...
 $ RACE      : chr  "White" "White" "White" "White" ...
 $ ETHNIC    : chr  "Not Hispanic or Latino" "Not Hispanic or Latino" "Not Hispanic or Latino" "Not Hispanic or Latino" ...
 $ LDH       : num  140 116 173 145 162 ...
 $ ALB       : num  155 128 159 171 131 ...
 $ HGB       : num  136 132 143 130 164 ...
 $ EXTRT     : chr  "Dexamethasone" "Daratumumab" "Dexamethasone" "Lenalidomide" ...
 $ EXDOSE    : int  110 30 60 150 130 20 170 80 20 140 ...
 $ BMI       : num  26.8 25.4 32 22 20.4 ...
 $ ECOG      : int  4 1 2 1 1 1 4 0 2 1 ...
 $ CRP       : num  12.83 6.7 11.96 2.1 7.23 ...
Code
numerical_columns <- c("LDH", "ALB", "HGB", "EXDOSE", "BMI", "ECOG", "CRP", "TOTAE", "SAECOUNT", "AGE")

# create quintile columns for numerical columns
data <- data %>% mutate(
  LDH_quantile = ntile(LDH, 5),
  ALB_quantile = ntile(ALB, 5),
  HGB_quantile = ntile(HGB, 5),
  EXDOSE_quantile = ntile(EXDOSE, 5),
  BMI_quantile = ntile(BMI, 5),
  ECOG_quantile = ntile(ECOG, 5),
  CRP_quantile = ntile(CRP, 5),
  TOTAE_quantile = ntile(TOTAE, 5),
  SAECOUNT_quantile = ntile(SAECOUNT, 5),
  AGE_quantile = ntile(AGE, 5)
)

numercial_quntile_columns <- c(
  "LDH_quantile",
  "ALB_quantile",
  "HGB_quantile",
  "EXDOSE_quantile",
  "BMI_quantile",
  "ECOG_quantile",
  "CRP_quantile",
  "TOTAE_quantile",
  "SAECOUNT_quantile",
  "AGE_quantile"
)
Code
# Get the index of the specific column
categorical_columns <- c("SEX", "RACE", "ETHNIC","EXTRT", "RSSTRESC", "AEDECOD_MS", "AEBODSYS", "AESER", "TRT01P")
Code
# combine all the columns
test_columns <- c(numercial_quntile_columns, categorical_columns)
test_columns
  1. 'LDH_quantile'
  2. 'ALB_quantile'
  3. 'HGB_quantile'
  4. 'EXDOSE_quantile'
  5. 'BMI_quantile'
  6. 'ECOG_quantile'
  7. 'CRP_quantile'
  8. 'TOTAE_quantile'
  9. 'SAECOUNT_quantile'
  10. 'AGE_quantile'
  11. 'SEX'
  12. 'RACE'
  13. 'ETHNIC'
  14. 'EXTRT'
  15. 'RSSTRESC'
  16. 'AEDECOD_MS'
  17. 'AEBODSYS'
  18. 'AESER'
  19. 'TRT01P'
Code
# doing the above for all the names left
# initialize the p value table
p_values <- data.frame(variable = character(), pval = numeric())
plots <- list()
for (column in test_columns) {
  km_fit <- survfit(Surv(AVAL, 1 - CNSR) ~ get(column), data = data)
  plot1 <- ggsurvplot(km_fit, data = data, risk.table = TRUE, pval = TRUE, conf.int = TRUE,
                      xlab = "Time (days)", ylab = "Survival Probability",
                      title = paste("Kaplan-Meier Survival Curve by", column))
  
  # store p value in a variable
  p_1 <- surv_pvalue(km_fit, data=data)
  
  # store the plot into a list
  plots <- c(plots, list(plot1))
  p_1$variable <- column
  # stack the p value
  p_values <- rbind(p_values, p_1)
}

# show the plots
for (plot in plots) {
  print(plot)
}
print(p_values)

            variable       pval   method pval.txt
1       LDH_quantile 0.82818591 Log-rank p = 0.83
2       ALB_quantile 0.53685880 Log-rank p = 0.54
3       HGB_quantile 0.45719761 Log-rank p = 0.46
4    EXDOSE_quantile 0.45194543 Log-rank p = 0.45
5       BMI_quantile 0.27818407 Log-rank p = 0.28
6      ECOG_quantile 0.22569470 Log-rank p = 0.23
7       CRP_quantile 0.92108310 Log-rank p = 0.92
8     TOTAE_quantile 0.06034994 Log-rank p = 0.06
9  SAECOUNT_quantile 0.97553802 Log-rank p = 0.98
10      AGE_quantile 0.27471230 Log-rank p = 0.27
11               SEX 0.17715902 Log-rank p = 0.18
12              RACE 0.34587291 Log-rank p = 0.35
13            ETHNIC 0.33172974 Log-rank p = 0.33
14             EXTRT 0.92144158 Log-rank p = 0.92
15          RSSTRESC 0.44755898 Log-rank p = 0.45
16        AEDECOD_MS 0.40835738 Log-rank p = 0.41
17          AEBODSYS 0.90328591 Log-rank  p = 0.9
18             AESER 0.78180192 Log-rank p = 0.78
19            TRT01P 0.47909679 Log-rank p = 0.48

We used Kaplan-Meier curves to identify potential predictors of PFS. The following variables were selected based on the log-rank test.

Features List

Key Feature - Treatment Group

Bio-Markers - LDH (Lactate Dehydrogenase) - ALB (Albumin) - HGB (Hemoglobin)

Demographic Factors - SEX

Clinical Characteristics - TOTAE (Count of adverse events) - ECOG (Eastern Cooperative Oncology Group performance status)